by Nelson Cárdenas
Customer segmentation (or market segmentation) is a set of techniques for splitting customers into clusters based on similarities, in order to get a sense of their behavior. In this notebook, we analyze patterns in the Online Retail Data Set from the UCI Machine Learning Repository. As UCI specifies about their dataset:
Columns description
This notebook is based on work made by some authors, see section "References"
import pandas as pd
import plotly.express as px
from datetime import timedelta
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import math
sns.set_theme()
import warnings
warnings.filterwarnings("ignore")
We load the dataset and describe the type of data
# Check which pandas version is in use (Excel-reading behavior varies across versions).
pd.__version__
# Load the raw transactions and keep an untouched copy so we can restore it later.
df_ori = pd.read_excel('Online Retail.xlsx')
df_copy = df_ori.copy()
df_ori.head(3)
def describe(df, pred=None):
"""this function describes a dataframe basic information"""
obs = df.shape[0]
types = df.dtypes
counts = df.apply(lambda x: x.count())
uniques = df.apply(lambda x: [x.unique()])
nulls = df.apply(lambda x: x.isnull().sum())
distincts = df.apply(lambda x: x.unique().shape[0])
missing_ratio = (df.isnull().sum()/ obs) * 100
skewness = df.skew()
kurtosis = df.kurt()
print('Data shape:', df.shape)
if pred is None:
cols = ['types', 'counts', 'distincts', 'nulls', 'missing ratio', 'uniques', 'skewness', 'kurtosis']
output = pd.concat([types, counts, distincts, nulls, missing_ratio, uniques, skewness, kurtosis], axis = 1, sort=True)
else:
corr = df.corr()[pred]
output = pd.concat([types, counts, distincts, nulls, missing_ratio, uniques, skewness, kurtosis, corr], axis = 1, sort=True)
corr_col = 'corr ' + pred
cols = ['types', 'counts', 'distincts', 'nulls', 'missing ratio', 'uniques', 'skewness', 'kurtosis', corr_col ]
output.columns = cols
dtypes = output.types.value_counts()
print('___________________________\nData types:\n\n',output.types.value_counts())
print('___________________________')
return output
# Profile the raw dataset; show columns with the highest share of missing values first.
details = describe(df_ori)
display(details.sort_values(by='missing ratio', ascending=False))
df_ori.describe()
For Quantity and UnitPrice we have negative values; these can be caused by product returns. We must exclude these values, but first we are going to look at the data.
# Start again from the untouched copy before filtering.
df_ori = df_copy.copy()
# Rows whose quantity or unit price is zero or negative (returns / bad records).
bool_error = (df_ori['Quantity'] <= 0) | (df_ori['UnitPrice'] <= 0)
n_rows = df_ori.shape[0]
n_both_negative = df_ori[(df_ori['Quantity'] < 0) & (df_ori['UnitPrice'] < 0)].shape[0]
n_errors = df_ori[bool_error].shape[0]
print('With Quantity negative and UnitPrice negative:', n_both_negative)
print('With Quantity or UnitPrice negative or equals to zero:', n_errors)
print('Values errors by percentage:', str(n_errors / n_rows * 100)[:5], '%')
print('CustomerIDs with these anomalous values are:', df_ori[bool_error]['CustomerID'].unique())
Rows without customer information, or with a zero or negative quantity or price, will be removed.
# Keep only rows that are valid and belong to a known customer.
valid_mask = ~(bool_error | df_ori['CustomerID'].isnull())
df_ori = df_ori[valid_mask]
details = describe(df_ori)
display(details.sort_values(by='distincts', ascending=False))
df_ori.describe()
# StockCode/Description pairs: a code appearing with more than one description
# signals inconsistent product labels.
stock_desc = df_ori.groupby(['StockCode', 'Description']).count().reset_index()
# Count distinct descriptions per stock code. We filter the raw counts Series
# instead of value_counts().reset_index(): in pandas >= 2.0 reset_index() renames
# the counts column to 'count', so the old filter compared stock codes, not counts.
desc_per_code = stock_desc['StockCode'].value_counts()
desc_per_code[desc_per_code > 1][:2]
# Example of one code carrying several descriptions.
df_ori[df_ori['StockCode'] == 23196]['Description'].unique()
# Map each stock code to its first observed description so labels become unique.
temp = df_ori['Description'].groupby(df_ori['StockCode']).unique().apply(pd.Series)
temp = temp[0].to_dict()
df_ori['Description'] = df_ori['StockCode'].map(temp)
df_ori['CustomerID'] = df_ori['CustomerID'].astype('int32')
df_ori[df_ori['StockCode'] == 23196]['Description'].unique()
We validate the result with the describe function
# Re-profile: after de-duplication, StockCode and Description should have the
# same number of distinct values.
details = describe(df_ori)
display(details.sort_values(by='distincts', ascending=False))
We analyze sales by plotting them against some of the other variables.
# Flag home-market (United Kingdom) transactions vs the rest of the world.
df_ori['Internal'] = df_ori['Country'].eq('United Kingdom').map({True: 'Yes', False: 'No'})
fig = px.pie(df_ori, names='Internal', title='Market representation', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()
# Revenue per line item, then total sales by country in descending order.
df_ori['Amount'] = df_ori['Quantity'] * df_ori['UnitPrice']
temp = df_ori.groupby('Country')['Amount'].sum().reset_index().sort_values(by='Amount', ascending=False)
fig = px.bar(temp, x='Country', y='Amount', title='Amount sales by country', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()
# Total amount per customer, biggest buyers first.
temp = df_ori.groupby('CustomerID')[['Amount']].sum().sort_values(by=['Amount'], ascending=False)
# Share of total sales captured by the top 50 customers.
top50_share = temp['Amount'].iloc[:50].sum() / temp['Amount'].sum() * 100
ratio_sales_inplot = str(top50_share)[:5] + ' %'
fig = px.bar(temp[:50].reset_index(), x='CustomerID', y='Amount', title='50 Best Customers by amount ('+ ratio_sales_inplot + ' of total amount of sales)', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(xaxis_type = 'category')
fig.show()
# Same view narrowed to the top 10 customers.
top10_share = temp['Amount'].iloc[:10].sum() / temp['Amount'].sum() * 100
ratio_sales_inplot = str(top10_share)[:5] + ' %'
fig = px.bar(temp[:10].reset_index(), x='CustomerID', y='Amount', title='10 Best Customers by amount ('+ ratio_sales_inplot + ' of total amount of sales)', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(xaxis_type = 'category')
fig.show()
# Number of purchase lines per customer (a frequency proxy), most active first.
temp = df_ori.groupby('CustomerID')[['Amount']].count().sort_values(by=['Amount'], ascending=False)
temp
# Share of all purchase lines made by the 10 most frequent buyers.
top10_freq_share = temp['Amount'].iloc[:10].sum() / temp['Amount'].sum() * 100
ratio_sales_inplot = str(top10_freq_share)[:4] + ' %'
fig = px.bar(temp[:10].reset_index(), x='CustomerID', y='Amount', title='10 Best Customers by frecuency of sales ('+ ratio_sales_inplot + ' of total frequency of sales)', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(xaxis_type = 'category')
fig.show()
# Revenue per product. Fix: the original also selected 'Description' (a grouping
# key) as an aggregation column, which string-concatenates it per group and breaks
# reset_index() on newer pandas; only 'Amount' needs aggregating — 'Description'
# comes back as a column via reset_index(). This also matches the frequency cell below.
temp = df_ori.groupby(['StockCode', 'Description'])[['Amount']].sum().sort_values(by=['Amount'], ascending=False).reset_index()
# Share of total sales generated by the 10 best-selling products.
ratio_sales_inplot = str(temp['Amount'].iloc[:10].sum() / temp['Amount'].sum() * 100)[:5] + ' %'
fig = px.bar(temp[:10].reset_index(), x='Description', y='Amount', title='10 best products by amount ('+ ratio_sales_inplot + ' of total amount of sales)', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(xaxis_type = 'category')
fig.update_traces(marker_color='rgb(246,207,113)')
fig.show()
# Purchase-line counts per product, most frequently bought first.
temp = df_ori.groupby(['StockCode', 'Description'])[['Amount']].count().sort_values(by=['Amount'], ascending=False).reset_index()
# Share of all purchase lines taken by the 10 most frequent products.
top10_prod_share = temp['Amount'].iloc[:10].sum() / temp['Amount'].sum() * 100
ratio_sales_inplot = str(top10_prod_share)[:5] + ' %'
fig = px.bar(temp[:10].reset_index(), x='Description', y='Amount', title='10 best products by frequency ('+ ratio_sales_inplot + ' of total frequency of sales)', color_discrete_sequence=px.colors.qualitative.Pastel)
fig.update_layout(xaxis_type = 'category')
fig.update_traces(marker_color='rgb(246,207,113)')
fig.show()
We are going to use the Recency, Frequency, Monetary Model (RFM). As stated in the Wikipedia page, RFM stands for the three dimensions:
This analysis must be careful with the time window, because it can be biased or inaccurate if the window spans an extremely long duration.
# Snapshot one day after the last invoice, so the most recent purchase has Recency = 1.
snapshot_date = df_ori['InvoiceDate'].max() + timedelta(days=1)
print(snapshot_date)
# Extract RFM features for each customer:
#   Recency       - days since the customer's last purchase
#   Frequency     - number of purchase lines
#   MonetaryValue - total amount spent
data_process = df_ori.groupby(['CustomerID']).agg({
    'InvoiceDate': lambda x: (snapshot_date - x.max()).days,
    'InvoiceNo': 'count',
    'Amount': 'sum'})
# Rename the aggregated columns to the RFM names.
# (Removed the original no-op self-assignment `data_process = data_process`.)
data_process.columns = ['Recency', 'Frequency', 'MonetaryValue']
data_process[:3]
Let's look at the distribution of our data.
# Plot the distribution of each RFM feature.
# sns.distplot was deprecated in seaborn 0.11 and later removed; histplot with
# kde=True and stat='density' is the documented replacement.
fig, axes = plt.subplots(1, 3, figsize=(22, 5))
for i, feature in enumerate(list(data_process.columns)):
    sns.histplot(data_process[feature], kde=True, stat='density', ax=axes[i])
The data is skewed. Using log transformation we can improve the quality of the data for future analysis
# Log-transform each RFM feature to reduce skew. All values are strictly
# positive after the earlier cleaning (Quantity and UnitPrice > 0, Recency >= 1),
# so the logarithm is always defined.
for col in ('Recency', 'Frequency', 'MonetaryValue'):
    data_process[col + '_log'] = data_process[col].map(math.log)
data_process[:3]
# Plot the distributions of the log-transformed features (columns 3 onward).
# sns.distplot was deprecated in seaborn 0.11 and later removed; histplot with
# kde=True and stat='density' is the documented replacement.
fig, axes = plt.subplots(1, 3, figsize=(22, 5))
for i, feature in enumerate(list(data_process.columns[3:])):
    sns.histplot(data_process[feature], kde=True, stat='density', ax=axes[i])
Now let's normalize our data; this improves k-means performance. We choose MinMaxScaler, rename the columns, and then use describe to analyze the results.
# Scale every feature into [0, 1]; k-means is sensitive to feature magnitudes.
scaler = MinMaxScaler()
data_process_normalized = pd.DataFrame(scaler.fit_transform(data_process))
# Prefix the scaled columns with 'n_' to distinguish them from the originals.
data_process_normalized.columns = ['n_' + col for col in data_process.columns]
data_process_normalized.describe()
Now we are going to plot the inertia for each cluster from 1 to 15
# Inertia (within-cluster sum of squares) for k = 1..15, on the scaled log features.
SSE, max_k = [], 15
list_input = list(data_process_normalized.columns[3:])
for k in range(1, max_k + 1):
    km = KMeans(n_clusters=k, random_state=42).fit(data_process_normalized[list_input])
    SSE.append(km.inertia_)
# Elbow plot: look for the k where the inertia curve flattens.
fig = go.Figure(data=go.Scatter(x=list(range(1, max_k + 1)), y=SSE))
fig.update_traces(marker_size=14)
fig.show()
Using the elbow heuristic, we decide to use k = 5 for our model.
# Fit the final 5-cluster model and assign a cluster label to every customer.
model = KMeans(n_clusters=5, random_state=42)
model.fit(data_process_normalized[list_input])
data_process_normalized['cluster'] = model.predict(data_process_normalized[list_input])
# 3D view of the clusters in the scaled log-RFM space.
x_col, y_col, z_col = list_input[0], list_input[1], list_input[2]
fig = px.scatter_3d(data_process_normalized, x=x_col, y=y_col, z=z_col,
                    color='cluster')
fig.show()
# Peek at the labeled data, then summarize each cluster's RFM profile.
data_process_normalized[:3]
rfm_stats = ['mean', 'min', 'max']
data_process_normalized.groupby('cluster').agg({
    'n_Recency': rfm_stats,
    'n_Frequency': rfm_stats,
    'n_MonetaryValue': rfm_stats
})
With this, we are done. We can define our clients based on the mean and range of values in each cluster. Further analysis includes using scorings based on RFM ranges. Also, we can use more robust analysis for the clustering, using not only RFM but other metrics such as demographics or product features.